package com.ruoyi.web.util;

import cn.hutool.core.collection.CollUtil;
import com.ruoyi.constant.SystemConstant;
import com.ruoyi.web.domain.PolicyCrawlArticle;
import com.ruoyi.web.service.PolicyCrawlArticleService;
import lombok.extern.slf4j.Slf4j;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;

import java.io.IOException;
import java.net.URL;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.time.LocalDateTime;
import java.time.ZoneId;
import java.time.format.DateTimeFormatter;
import java.time.format.DateTimeParseException;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Crawls policy-document list columns of the Shandong Department of Industry and
 * Information Technology website (gxt.shandong.gov.cn) and converts every article
 * that is not yet stored into a {@link PolicyCrawlArticle}.
 *
 * <p>The four public {@code getPolicyCrawlArticlePolicyPapersList*} methods differ only
 * in the list URL they crawl and the classification they stamp on the result; the
 * shared logic lives in the private helpers below.
 *
 * <p>The article service is injected through an instance setter into a static field so
 * that the static crawl methods can use it; the crawl methods must therefore only be
 * called after the Spring context has created this component.
 */
@Slf4j
@Component
public class HtmlParseUtil {

    /** Site root prepended to root-relative {@code src}/{@code href} values in article content. */
    private static final String SITE_ROOT = "http://gxt.shandong.gov.cn";

    /** Only links containing this prefix are treated as crawlable article pages. */
    private static final String ARTICLE_URL_PREFIX = "http://gxt.shandong.gov.cn/art";

    /** DOM id of the element whose {@code <script>} holds the article {@code <li>} list. */
    private static final String LIST_CONTAINER_ID = "641892";

    /** Province id written to every crawled article (Shandong). */
    private static final String PROVINCE_ID = "F2557678-A421-4E33-9F18-B93FDB3C86DC";

    /** Label preceding the publish date in the article headline. */
    private static final String PUBLISH_DATE_LABEL = "发布日期：";

    private static final String PUBLISH_DATE_PATTERN = "yyyy-MM-dd HH:mm";

    private static final DateTimeFormatter PUBLISH_DATE_FORMATTER =
            DateTimeFormatter.ofPattern(PUBLISH_DATE_PATTERN);

    /** Matches each {@code <li>...</li>} fragment embedded in the list script. */
    private static final Pattern LI_PATTERN = Pattern.compile("<li[^>]*>.*?</li>", Pattern.DOTALL);

    /** Captures the text of the first anchor inside an {@code <li>} fragment. */
    private static final Pattern ANCHOR_TEXT_PATTERN = Pattern.compile("<a.*?>(.*?)</a>");

    /** Total number of attempts made for one list page. */
    private static final int MAX_ATTEMPTS = 6;

    /** Pause between two attempts, in milliseconds. */
    private static final long RETRY_DELAY_MS = 5000L;

    /** Shared generator for the placeholder view count. */
    private static final Random RANDOM = new Random();

    private static PolicyCrawlArticleService policyCrawlArticleService;

    /**
     * Stores the Spring-managed article service in a static field so the static crawl
     * methods can query it.
     *
     * @param service the article service used to look up existing articles by title
     */
    @Autowired
    public void setArticleService(PolicyCrawlArticleService service) {
        HtmlParseUtil.policyCrawlArticleService = service;
    }

    /**
     * Crawls list column {@code col103862} and classifies the results as "规范性文件".
     *
     * @return the newly found articles; empty when nothing new was found or crawling failed
     */
    public static List<PolicyCrawlArticle> getPolicyCrawlArticlePolicyPapersList1() {
        return crawlColumn("http://gxt.shandong.gov.cn/col/col103862/index.html", "规范性文件");
    }

    /**
     * Crawls list column {@code col103863} and classifies the results as "其他文件".
     *
     * @return the newly found articles; empty when nothing new was found or crawling failed
     */
    public static List<PolicyCrawlArticle> getPolicyCrawlArticlePolicyPapersList() {
        return crawlColumn("http://gxt.shandong.gov.cn/col/col103863/index.html", "其他文件");
    }

    /**
     * Crawls list column {@code col103864} and classifies the results as "规范性文件清理".
     *
     * @return the newly found articles; empty when nothing new was found or crawling failed
     */
    public static List<PolicyCrawlArticle> getPolicyCrawlArticlePolicyPapersList2() {
        return crawlColumn("http://gxt.shandong.gov.cn/col/col103864/index.html", "规范性文件清理");
    }

    /**
     * Crawls list column {@code col103865} and classifies the results as "政策解读".
     *
     * @return the newly found articles; empty when nothing new was found or crawling failed
     */
    public static List<PolicyCrawlArticle> getPolicyCrawlArticlePolicyPapersList3() {
        return crawlColumn("http://gxt.shandong.gov.cn/col/col103865/index.html", "政策解读");
    }

    /**
     * Crawls one list page with retries.
     *
     * <p>Each attempt starts from an empty list, so a failure in the middle of an attempt
     * cannot leave duplicates behind when the page is crawled again. If every attempt
     * fails, the articles collected by the last attempt before its failure are returned.
     *
     * @param url            the list page to crawl
     * @param classification the classification stamped on every article found
     * @return the articles that are not yet known to the article service
     */
    private static List<PolicyCrawlArticle> crawlColumn(String url, String classification) {
        List<PolicyCrawlArticle> articles = new ArrayList<>();
        int remainingAttempts = MAX_ATTEMPTS;
        while (remainingAttempts > 0) {
            // Discard partial results of a previous failed attempt.
            articles.clear();
            try {
                collectArticles(url, classification, articles);
                List<PolicyCrawlArticle> distinctArticles = CollUtil.distinct(articles);
                distinctArticles.forEach(article -> log.info("文章{}", article));
                return distinctArticles;
            } catch (IOException | NullPointerException e) {
                // NullPointerException is still caught so that unexpected page structures
                // keep triggering a retry instead of escaping to the caller.
                log.warn("文章爬取失败====={}====", e.getMessage(), e);
                remainingAttempts--;
                if (remainingAttempts <= 0) {
                    log.error("文章爬取重试次数已用完,放弃爬取");
                    break;
                }
                try {
                    Thread.sleep(RETRY_DELAY_MS);
                } catch (InterruptedException ie) {
                    // Restore the flag and stop retrying: the thread was asked to stop.
                    Thread.currentThread().interrupt();
                    log.warn("文章爬取被中断,停止重试");
                    break;
                }
            }
        }
        return articles;
    }

    /**
     * Performs a single crawl of the list page and appends every new article to
     * {@code articles}.
     *
     * @param url            the list page to crawl
     * @param classification the classification stamped on every article found
     * @param articles       the list receiving the new articles
     * @throws IOException if a page cannot be fetched or the list container is missing
     */
    private static void collectArticles(String url, String classification,
                                        List<PolicyCrawlArticle> articles) throws IOException {
        Document document = Jsoup.parse(new URL(url), SystemConstant.NUM_5000);
        Element listContainer = document.getElementById(LIST_CONTAINER_ID);
        if (listContainer == null) {
            // Reported as IOException so the caller retries, as it did for the former NPE.
            throw new IOException("未找到文章列表容器 #" + LIST_CONTAINER_ID + ": " + url);
        }
        // The <li> entries are embedded as text inside a <script> element.
        String scriptContent = listContainer.select("script").html();
        List<String> liTags = new ArrayList<>();
        Matcher liMatcher = LI_PATTERN.matcher(scriptContent);
        while (liMatcher.find()) {
            liTags.add(liMatcher.group());
        }

        for (String liTag : filterLiTags(liTags)) {
            String articleUrl = Jsoup.parse(liTag)
                    .getElementsByTag("a")
                    .eq(SystemConstant.NUM_ZERO)
                    .attr("href");
            if (!articleUrl.contains(ARTICLE_URL_PREFIX)) {
                // Not an article page of this site; skip it.
                continue;
            }
            Document articleContext = Jsoup.parse(new URL(articleUrl), SystemConstant.NUM_5000);
            Elements headline = articleContext.select(".headline");
            String articleTitle = headline.select("div.headline h2").text();

            // Articles are identified by title; skip those already stored.
            PolicyCrawlArticle existingArticle = policyCrawlArticleService.selectByTitle(articleTitle);
            if (existingArticle != null) {
                log.info("文章 {} 已经存在,不需要再次添加", articleTitle);
                continue;
            }
            articles.add(toArticle(articleContext, headline, articleTitle, classification));
        }
    }

    /**
     * Builds a {@link PolicyCrawlArticle} from a fetched article page.
     *
     * @param articleContext the parsed article page
     * @param headline       the {@code .headline} elements of that page
     * @param articleTitle   the already extracted article title
     * @param classification the classification to stamp on the article
     * @return the populated article
     */
    private static PolicyCrawlArticle toArticle(Document articleContext, Elements headline,
                                                String articleTitle, String classification) {
        Elements content = articleContext.select(".s17");
        // Drop the view-counter label and floating widgets from the stored content.
        content.select("label:has(span:contains(浏览次数))").remove();
        content.select(".floating").remove();
        String contentHtml = content.toString();

        // Accept both the half-width and the full-width colon after the label.
        String infoSource = headline.select("div.headline p span:eq(1)").text()
                .replaceAll("信息来源[:：]", "")
                .trim();

        String dateText = content.select(".headline p span")
                .select("span:contains(发布日期)")
                .text();
        Date publishDate = parsePublishDate(dateText);

        // Only root-relative URLs are prefixed; absolute links must be left untouched.
        String absolutizedHtml = contentHtml
                .replace("src=\"/", "src=\"" + SITE_ROOT + "/")
                .replace("href=\"/", "href=\"" + SITE_ROOT + "/");

        return new PolicyCrawlArticle()
                .setArticleTitle(articleTitle)
                .setContentLabel(absolutizedHtml)
                .setInfoSource(infoSource)
                .setProvince(PROVINCE_ID)
                .setPublishDate(publishDate)
                .setType("政策资讯")
                .setCreateTime(new Date())
                .setClassification(classification)
                // Placeholder view count in the range [23, 150].
                .setViewCount(RANDOM.nextInt(128) + 23);
    }

    /**
     * Parses the publish date from the headline text {@code "发布日期：yyyy-MM-dd HH:mm"}.
     *
     * <p>The strict {@link DateTimeFormatter} is tried first, then the lenient
     * {@link SimpleDateFormat}. When the label is missing or neither parser succeeds,
     * the current time is returned.
     *
     * @param dateText the raw text of the publish-date span, may be empty
     * @return the parsed publish date, or the current time as a fallback
     */
    private static Date parsePublishDate(String dateText) {
        if (dateText == null || dateText.length() < PUBLISH_DATE_LABEL.length()) {
            // Without this guard substring() would throw and abort the whole crawl.
            log.warn("未找到发布日期,使用当前时间: [{}]", dateText);
            return new Date();
        }
        String publishDateStr = dateText.substring(PUBLISH_DATE_LABEL.length()).trim();
        try {
            LocalDateTime localDateTime = LocalDateTime.parse(publishDateStr, PUBLISH_DATE_FORMATTER);
            return Date.from(localDateTime.atZone(ZoneId.systemDefault()).toInstant());
        } catch (DateTimeParseException e) {
            try {
                // SimpleDateFormat is not thread-safe, so a fresh instance is used per call.
                return new SimpleDateFormat(PUBLISH_DATE_PATTERN).parse(publishDateStr);
            } catch (ParseException pe) {
                log.warn("发布日期解析失败,使用当前时间: [{}]", publishDateStr);
                return new Date();
            }
        }
    }

    /**
     * Removes {@code <li>} fragments whose anchor text has already been seen.
     *
     * <p>The first fragment for each anchor text is kept, in input order. Fragments in
     * which no {@code <a ...>...</a>} can be matched on a single line are dropped.
     *
     * @param liTags the raw {@code <li>} fragments; {@code null} is treated as empty
     * @return a new list containing one fragment per distinct anchor text
     */
    public static List<String> filterLiTags(List<String> liTags) {
        List<String> uniqueLiTags = new ArrayList<>();
        if (liTags == null) {
            return uniqueLiTags;
        }
        Set<String> seenTitles = new HashSet<>();
        for (String liTag : liTags) {
            if (liTag == null) {
                continue;
            }
            Matcher titleMatcher = ANCHOR_TEXT_PATTERN.matcher(liTag);
            // Set.add returns false for a title that was already recorded.
            if (titleMatcher.find() && seenTitles.add(titleMatcher.group(1))) {
                uniqueLiTags.add(liTag);
            }
        }
        return uniqueLiTags;
    }
}